cohort analysis

安装量: 112
排名: #7654

安装

npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'Cohort Analysis'
Cohort Analysis
Overview
Cohort analysis tracks groups of users with shared characteristics over time, revealing patterns in retention, engagement, and lifetime value.
When to Use
Measuring user retention rates and identifying when users churn
Analyzing customer lifetime value (LTV) and payback periods
Comparing performance across different user acquisition channels or campaigns
Understanding how product changes affect different user groups over time
Tracking engagement patterns and identifying early warning signs of churn
Evaluating the long-term impact of onboarding improvements or feature releases
Core Concepts
Cohort
Group of users sharing a characteristic (signup date, region, etc.)
Cohort Size
Initial group size
Retention Rate
Percentage remaining active
Churn Rate
Percentage who left
Retention Curve
How cohort degrades over time
Cohort Types
Acquisition Date
Users grouped by signup period
Behavioral
Users grouped by actions taken
Revenue
Users grouped by purchase value
Geographic
Users grouped by location
Demographic
Users grouped by characteristics
Implementation with Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create sample user lifecycle data
np.random.seed(42)

# Generate user data: one row per (user, month in which the user is active).
n_users = 5000
users = []
for user_id in range(n_users):
    # The order of the two random draws is kept so the seeded data is stable.
    signup_month = np.random.choice(range(1, 13))
    lifetime_months = np.random.poisson(6) + 1  # every user is active >= 1 month
    users.extend(
        {
            'user_id': user_id,
            'signup_month': signup_month,
            'month': month,
            'active': 1,
        }
        for month in range(1, lifetime_months + 1)
    )

df = pd.DataFrame(users)

# Add derived columns
df['cohort_month'] = df['signup_month']
df['cohort_age'] = df['month']  # Could be day, week, etc.

# Approximate calendar date of each activity row, using 30-day months.
# The offset combines the signup month with the cohort age so that every
# active month of a user gets its own date; deriving it from signup_month
# alone would give all rows of a cohort the same date. Rows with month == 1
# keep the offset signup_month * 30.
df['date'] = pd.to_datetime('2023-01-01') + pd.to_timedelta(
    (df['signup_month'] + df['month'] - 1) * 30, unit='D'
)

print("User Data Summary:")
print(df.head(10))

# 1. Cohort Table (Retention Matrix)
# Count distinct users for every (cohort_month, cohort_age) pair.
cohort_data = (
    df.groupby(['cohort_month', 'cohort_age'])
    .agg({'user_id': 'nunique'})
    .reset_index()
)
cohort_data.columns = ['cohort_month', 'cohort_age', 'unique_users']

# Create pivot table: one row per cohort, one column per cohort age.
cohort_pivot = cohort_data.pivot(
    index='cohort_month', columns='cohort_age', values='unique_users'
)

print("\nCohort Sizes (Raw User Counts):")
print(cohort_pivot)

# 2. Cohort Retention (as percentage of cohort size)
# The first column of the pivot is used as each cohort's initial size.
cohort_size = cohort_pivot.iloc[:, 0]

# Divide each row by its cohort's size and scale to percent.
retention_table = cohort_pivot.divide(cohort_size, axis=0) * 100

print("\nCohort Retention Rate (%):")
print(retention_table.round(1))

# 3. Visualize Retention Matrix
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Heatmap of raw counts
sns.heatmap(
    cohort_pivot,
    annot=True,
    fmt='g',
    cmap='YlOrRd',
    ax=axes[0],
    cbar_kws={'label': 'User Count'},
)
axes[0].set_title('Cohort Sizes - User Counts')
axes[0].set_xlabel('Cohort Age (Months)')
axes[0].set_ylabel('Cohort Month')

# Heatmap of retention rates; the color scale is pinned to 0-100 percent.
sns.heatmap(
    retention_table,
    annot=True,
    fmt='.0f',
    cmap='RdYlGn',
    vmin=0,
    vmax=100,
    ax=axes[1],
    cbar_kws={'label': 'Retention %'},
)
axes[1].set_title('Cohort Retention Rates (%)')
axes[1].set_xlabel('Cohort Age (Months)')
axes[1].set_ylabel('Cohort Month')

plt.tight_layout()
plt.show()

# 4. Retention Curve
fig, ax = plt.subplots(figsize=(12, 6))

# Plot retention curves for each cohort
for cohort_month in cohort_pivot.index[:8]:  # First 8 cohorts
    cohort_retention = retention_table.loc[cohort_month]
    ax.plot(
        cohort_retention.index,
        cohort_retention.values,
        marker='o',
        label=f'Cohort {cohort_month}',
    )

ax.set_xlabel('Cohort Age (Months)')
ax.set_ylabel('Retention Rate (%)')
ax.set_title('Retention Curves by Cohort')
# Anchor the legend outside the axes so it does not cover the curves.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 105])

plt.tight_layout()
plt.show()

# 5. Average Retention Curve
fig, ax = plt.subplots(figsize=(10, 6))

# Calculate average retention at each age (mean over cohorts, per column)
avg_retention = retention_table.mean()
ax.plot(
    avg_retention.index,
    avg_retention.values,
    marker='o',
    linewidth=2,
    markersize=8,
    color='navy',
)
# Shade the area under the average curve.
ax.fill_between(avg_retention.index, avg_retention.values, alpha=0.3, color='navy')

# Add a variability band: mean +/- one standard deviation across cohorts.
# This is a spread band, not a statistical confidence interval, even though
# the plot title calls it a confidence band.
std_retention = retention_table.std()
ax.fill_between(
    std_retention.index,
    avg_retention - std_retention,
    avg_retention + std_retention,
    alpha=0.2,
    color='navy',
    label='±1 Std Dev',
)

ax.set_xlabel('Cohort Age (Months)')
ax.set_ylabel('Retention Rate (%)')
ax.set_title('Average Retention Curve with Confidence Band')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_ylim([0, 105])

plt.tight_layout()
plt.show()

# 6. Churn Rate
# Churn is the complement of retention, in percent.
churn_rate = 100 - retention_table

print("\nChurn Rates (%):")
print(churn_rate.round(1).head())

# 7. Revenue Cohort Analysis
# Add revenue data: one exponential draw (scale 50) per activity row.
np.random.seed(42)
df['revenue'] = np.random.exponential(50, len(df))

# Revenue by cohort: total revenue and distinct users per (cohort, age).
revenue_data = (
    df.groupby(['cohort_month', 'cohort_age'])
    .agg({'revenue': 'sum', 'user_id': 'nunique'})
    .reset_index()
)
revenue_data['revenue_per_user'] = revenue_data['revenue'] / revenue_data['user_id']

revenue_pivot = revenue_data.pivot(
    index='cohort_month', columns='cohort_age', values='revenue'
)
rpu_pivot = revenue_data.pivot(
    index='cohort_month', columns='cohort_age', values='revenue_per_user'
)

# Visualize revenue
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

sns.heatmap(
    revenue_pivot,
    annot=True,
    fmt='.0f',
    cmap='YlGnBu',
    ax=axes[0],
    cbar_kws={'label': 'Total Revenue ($)'},
)
axes[0].set_title('Total Revenue by Cohort')
axes[0].set_xlabel('Cohort Age (Months)')
axes[0].set_ylabel('Cohort Month')

sns.heatmap(
    rpu_pivot,
    annot=True,
    fmt='.2f',
    cmap='YlGnBu',
    ax=axes[1],
    cbar_kws={'label': 'Revenue per User ($)'},
)
axes[1].set_title('Revenue per User by Cohort')
axes[1].set_xlabel('Cohort Age (Months)')
axes[1].set_ylabel('Cohort Month')

plt.tight_layout()
plt.show()

# 8. Lifetime Value Calculation
df['month_since_signup'] = df['cohort_age']

# One row per user: summed revenue, the user's cohort, and the largest
# cohort age at which the user has a row.
ltv_data = (
    df.groupby('user_id')
    .agg({
        'revenue': 'sum',
        'cohort_month': 'first',
        'month_since_signup': 'max',
    })
    .reset_index()
)
ltv_data.columns = ['user_id', 'lifetime_value', 'cohort_month', 'lifetime_months']

# Average LTV by cohort
ltv_by_cohort = ltv_data.groupby('cohort_month')['lifetime_value'].agg(
    ['mean', 'median', 'std']
)

print("\nLifetime Value by Cohort:")
print(ltv_by_cohort.round(2))

fig, ax = plt.subplots(figsize=(10, 6))
ltv_by_cohort['mean'].plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Average Lifetime Value by Cohort')
ax.set_xlabel('Cohort Month')
ax.set_ylabel('Lifetime Value ($)')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# 9. Cohort Composition Over Time
fig, ax = plt.subplots(figsize=(12, 6))

# Active users per month by cohort: count activity rows per (date, cohort).
# NOTE(review): this presumes df['date'] varies with the month of activity;
# if it is constant within a cohort, each cohort contributes a single point.
active_by_month = (
    df.groupby(['date', 'cohort_month'])
    .size()
    .reset_index(name='active_users')
)
pivot_active = active_by_month.pivot(
    index='date', columns='cohort_month', values='active_users'
)

pivot_active.plot(ax=ax, marker='o')
ax.set_title('Active Users Per Month by Cohort')
ax.set_xlabel('Month')
ax.set_ylabel('Active Users')
ax.legend(title='Cohort Month', bbox_to_anchor=(1.05, 1))
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 10. Cohort Summary Metrics
# Column position of the third cohort age, clamped to the last available
# column when the retention table has fewer than three columns.
month_3_col = min(2, retention_table.shape[1] - 1)

# NOTE: 'Month 1 Retention' reads the first retention column, which is the
# cohort size divided by itself and therefore 100 for each cohort.
summary_metrics = pd.DataFrame({
    'Cohort Month': cohort_size.index,
    'Initial Size': cohort_size.values,
    'Month 1 Retention': retention_table.iloc[:, 0].values,
    'Month 3 Retention': retention_table.iloc[:, month_3_col].values,
    'Avg LTV': ltv_by_cohort['mean'].values,
})

print("\nCohort Summary Metrics:")
print(summary_metrics.round(2))

# 11. Visualization comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Month 1 vs Month 3 retention, averaged over cohorts
ax_plot = axes[0]
months = ['Month 1', 'Month 3']
month_1_ret = retention_table.iloc[:, 0].mean()
# Clamp to the last column when fewer than three cohort ages exist.
month_3_ret = retention_table.iloc[:, min(2, retention_table.shape[1] - 1)].mean()
ax_plot.bar(
    months,
    [month_1_ret, month_3_ret],
    color=['#1f77b4', '#ff7f0e'],
    edgecolor='black',
)
ax_plot.set_ylabel('Retention Rate (%)')
ax_plot.set_title('Average Retention by Milestone')
ax_plot.set_ylim([0, 100])
# Label each bar with its value, placed slightly above the bar top.
for i, v in enumerate([month_1_ret, month_3_ret]):
    ax_plot.text(i, v + 2, f'{v:.1f}%', ha='center')

# Cohort size trend
axes[1].plot(
    cohort_size.index, cohort_size.values, marker='o', linewidth=2, markersize=8
)
axes[1].set_xlabel('Cohort Month')
axes[1].set_ylabel('Cohort Size')
axes[1].set_title('Cohort Sizes Over Time')
axes[1].grid(True, alpha=0.3)

# LTV trend
axes[2].plot(
    ltv_by_cohort.index,
    ltv_by_cohort['mean'].values,
    marker='o',
    linewidth=2,
    markersize=8,
    color='green',
)
axes[2].set_xlabel('Cohort Month')
axes[2].set_ylabel('Average Lifetime Value ($)')
axes[2].set_title('LTV Trend by Cohort')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nCohort analysis complete!")
Key Metrics
Retention Rate
% of cohort active
Churn Rate
% of cohort lost
Day/Month 1 Retention
Early engagement
Lifetime Value
Total revenue per user
Payback Period
Time to recover CAC
Insights to Look For
Early retention predictors
Differences between cohorts
Seasonal patterns
Engagement degradation
Revenue trends
Deliverables
Cohort retention matrix
Retention curve visualization
Churn rate analysis
Lifetime value calculations
Revenue per cohort
Executive summary with insights
Actionable recommendations
返回排行榜